import codecs
import re


def parse_quora_dul_data(in_file):
    """Parse a Quora duplicate-question data file into tokenized pairs.

    The first line of the file is read and discarded (presumably the column
    header -- confirm against the data file). Every following line is split
    on tab characters; lines with fewer than six fields are skipped. Fields
    0, 3, 4 and 5 are used as the pair id, the first question, the second
    question and the duplicate label.

    Args:
        in_file: Path of the UTF-8 encoded, tab-separated input file.

    Returns:
        A list of ``(ques1_tokens, ques2_tokens, is_dul, pid)`` tuples, one
        per accepted line, where the token lists are produced by
        ``tokenize``, ``is_dul`` is an ``int`` and ``pid`` is the raw string
        from the first field.

    Raises:
        ValueError: If the sixth field of an accepted line is not an integer.
    """
    ques_pairs = []
    with codecs.open(in_file, 'r', encoding="utf-8") as fi:
        fi.readline()  # discard the first line
        for line in fi:
            splits = line.split("\t")
            # Blank or truncated lines have too few fields; ignore them.
            if len(splits) < 6:
                continue
            pid, ques1, ques2 = splits[0], splits[3], splits[4]
            # int() tolerates the trailing newline left on the last field.
            is_dul = int(splits[5])

            # Previously this append was commented out, so the function
            # always returned an empty list.
            ques_pairs.append((tokenize(ques1), tokenize(ques2), is_dul, pid))

    return ques_pairs


def tokenize(sent):
    """Split a sentence into lower-cased word and punctuation tokens.

    The sentence is split on runs of non-word characters. The separator runs
    are captured, so punctuation survives as tokens of its own, while pieces
    that are empty or consist only of whitespace are dropped. Every kept
    piece is stripped of surrounding whitespace and lower-cased.

    Args:
        sent: The text to tokenize.

    Returns:
        A list of non-empty, stripped, lower-cased token strings in their
        original order.
    """
    # The pattern must not be able to match the empty string: since
    # Python 3.7 re.split() also splits on empty matches, so the former
    # optional group '(\W+)?' chopped the text into single characters.
    pieces = (piece.strip() for piece in re.split(r'(\W+)', sent))
    return [piece.lower() for piece in pieces if piece]


if __name__ == '__main__':
    # Manual smoke check: tokenize a sample sentence and show the tokens.
    print(tokenize("你 能 帮助 介绍 我 I贷 每期 还款"))

